Outline:
In [ ]:
import json
from etk.etk import ETK
from etk.extractors.glossary_extractor import GlossaryExtractor
from etk.etk_module import ETKModule
In [23]:
# Create the ETK instance that the cells below use to build the document,
# load the glossary, and supply the tokenizer.
etk = ETK()
In [24]:
# Toy input document: a single "projects" list whose entries each carry a
# short "name" and a free-text "description" that mentions contributor names.
sample_input = {
    "projects": [
        {
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
        },
        {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }
    ]
}
In [25]:
# Wrap the sample input in an ETK document; the later cells select segments
# from it, run extraction through it, and finally print its `value`.
doc = etk.create_document(sample_input)
In [26]:
# Glossary-driven extractor built from the names file and ETK's default
# tokenizer, configured with case_sensitive=False and ngrams=1.
# NOTE(review): the glossary path is relative, so it presumably resolves
# against the kernel's working directory — confirm the notebook is launched
# from the repository root.
name_extractor = GlossaryExtractor(
    etk.load_glossary("./examples/hello_world/names.txt"),
    "name_extractor",
    etk.default_tokenizer,
    case_sensitive=False,
    ngrams=1,
)
In [27]:
# Two selections over the same document, using path expressions:
#   - the "description" field of every entry under "projects"
#   - every entry under "projects" itself (used below as the store target)
descriptions = doc.select_segments("projects[*].description")
projects = doc.select_segments("projects[*]")
In [28]:
# Walk the description segments and project segments in lockstep: run the
# name extractor on each description and store the result on the matching
# project under the "members" key.
# NOTE(review): zip() pairs purely by position and stops at the shorter
# sequence; this assumes both selections come back in the same order with
# exactly one description per project — confirm for inputs where a project
# may lack a "description".
for d, p in zip(descriptions, projects):
    names = doc.extract(name_extractor, d)
    p.store(names, "members")
In [29]:
# Pretty-print the document's value as JSON (2-space indent) to inspect what
# the loop above stored. print() is used so the newlines render, rather than
# showing the escaped string repr of json.dumps().
print(json.dumps(doc.value, indent=2))
In [30]: